Data Munging

Data loading and preprocessing with pandas

Fast and easy data loading


In [4]:
import pandas as pd
iris_filename = 'datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',', decimal='.', header=None,
names= ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
'target'])

In [5]:
# If the dataset is not available online, you can follow these steps to
# download it from the Internet:
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
#import urllib2
url = "http://aima.cs.berkeley.edu/data/iris.csv"
set1 = urllib2.Request(url)
iris_p = urllib2.urlopen(set1)
iris_other = pd.read_csv(iris_p, sep=',', decimal='.',
header=None, names= ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'target'])
iris_other.head()


Out[5]:
sepal_length sepal_width petal_length petal_width target
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

In [6]:
iris.head()


Out[6]:
sepal_length sepal_width petal_length petal_width target
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [7]:
iris.tail()


Out[7]:
sepal_length sepal_width petal_length petal_width target
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

In [8]:
iris.head(2)


Out[8]:
sepal_length sepal_width petal_length petal_width target
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa

In [9]:
iris.columns


Out[9]:
Index([u'sepal_length', u'sepal_width', u'petal_length', u'petal_width',
       u'target'],
      dtype='object')

In [10]:
Y = iris['target']
Y


Out[10]:
0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
5         Iris-setosa
6         Iris-setosa
7         Iris-setosa
8         Iris-setosa
9         Iris-setosa
10        Iris-setosa
11        Iris-setosa
12        Iris-setosa
13        Iris-setosa
14        Iris-setosa
15        Iris-setosa
16        Iris-setosa
17        Iris-setosa
18        Iris-setosa
19        Iris-setosa
20        Iris-setosa
21        Iris-setosa
22        Iris-setosa
23        Iris-setosa
24        Iris-setosa
25        Iris-setosa
26        Iris-setosa
27        Iris-setosa
28        Iris-setosa
29        Iris-setosa
            ...      
120    Iris-virginica
121    Iris-virginica
122    Iris-virginica
123    Iris-virginica
124    Iris-virginica
125    Iris-virginica
126    Iris-virginica
127    Iris-virginica
128    Iris-virginica
129    Iris-virginica
130    Iris-virginica
131    Iris-virginica
132    Iris-virginica
133    Iris-virginica
134    Iris-virginica
135    Iris-virginica
136    Iris-virginica
137    Iris-virginica
138    Iris-virginica
139    Iris-virginica
140    Iris-virginica
141    Iris-virginica
142    Iris-virginica
143    Iris-virginica
144    Iris-virginica
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: target, dtype: object

In [11]:
X = iris[['sepal_length', 'sepal_width']]
X


Out[11]:
sepal_length sepal_width
0 5.1 3.5
1 4.9 3.0
2 4.7 3.2
3 4.6 3.1
4 5.0 3.6
5 5.4 3.9
6 4.6 3.4
7 5.0 3.4
8 4.4 2.9
9 4.9 3.1
10 5.4 3.7
11 4.8 3.4
12 4.8 3.0
13 4.3 3.0
14 5.8 4.0
15 5.7 4.4
16 5.4 3.9
17 5.1 3.5
18 5.7 3.8
19 5.1 3.8
20 5.4 3.4
21 5.1 3.7
22 4.6 3.6
23 5.1 3.3
24 4.8 3.4
25 5.0 3.0
26 5.0 3.4
27 5.2 3.5
28 5.2 3.4
29 4.7 3.2
... ... ...
120 6.9 3.2
121 5.6 2.8
122 7.7 2.8
123 6.3 2.7
124 6.7 3.3
125 7.2 3.2
126 6.2 2.8
127 6.1 3.0
128 6.4 2.8
129 7.2 3.0
130 7.4 2.8
131 7.9 3.8
132 6.4 2.8
133 6.3 2.8
134 6.1 2.6
135 7.7 3.0
136 6.3 3.4
137 6.4 3.1
138 6.0 3.0
139 6.9 3.1
140 6.7 3.1
141 6.9 3.1
142 5.8 2.7
143 6.8 3.2
144 6.7 3.3
145 6.7 3.0
146 6.3 2.5
147 6.5 3.0
148 6.2 3.4
149 5.9 3.0

150 rows × 2 columns


In [12]:
X.shape


Out[12]:
(150, 2)

In [13]:
Y.shape


Out[13]:
(150,)

Dealing with problematic data


In [14]:
import pandas as pd
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')
fake_dataset


Out[14]:
Date Temperature_city_1 Temperature_city_2 Temperature_city_3 Which_destination
0 20140910 80.0 32.0 40 1
1 20140911 100.0 50.0 36 2
2 20140912 102.0 55.0 46 1
3 20140913 60.0 20.0 35 3
4 20140914 60.0 NaN 32 3
5 20140915 NaN 57.0 42 2

In [15]:
fake_dataset = pd.read_csv('a_loading_example_1.csv',
parse_dates=[0])
fake_dataset


Out[15]:
Date Temperature_city_1 Temperature_city_2 Temperature_city_3 Which_destination
0 2014-09-10 80.0 32.0 40 1
1 2014-09-11 100.0 50.0 36 2
2 2014-09-12 102.0 55.0 46 1
3 2014-09-13 60.0 20.0 35 3
4 2014-09-14 60.0 NaN 32 3
5 2014-09-15 NaN 57.0 42 2

In [16]:
fake_dataset.fillna(50)


Out[16]:
Date Temperature_city_1 Temperature_city_2 Temperature_city_3 Which_destination
0 2014-09-10 80.0 32.0 40 1
1 2014-09-11 100.0 50.0 36 2
2 2014-09-12 102.0 55.0 46 1
3 2014-09-13 60.0 20.0 35 3
4 2014-09-14 60.0 50.0 32 3
5 2014-09-15 50.0 57.0 42 2

In [17]:
fake_dataset.fillna(-1)


Out[17]:
Date Temperature_city_1 Temperature_city_2 Temperature_city_3 Which_destination
0 2014-09-10 80.0 32.0 40 1
1 2014-09-11 100.0 50.0 36 2
2 2014-09-12 102.0 55.0 46 1
3 2014-09-13 60.0 20.0 35 3
4 2014-09-14 60.0 -1.0 32 3
5 2014-09-15 -1.0 57.0 42 2

In [18]:
fake_dataset.fillna(fake_dataset.mean(axis=0))


Out[18]:
Date Temperature_city_1 Temperature_city_2 Temperature_city_3 Which_destination
0 2014-09-10 80.0 32.0 40 1
1 2014-09-11 100.0 50.0 36 2
2 2014-09-12 102.0 55.0 46 1
3 2014-09-13 60.0 20.0 35 3
4 2014-09-14 60.0 42.8 32 3
5 2014-09-15 80.4 57.0 42 2

In [19]:
bad_dataset = pd.read_csv('a_loading_example_2.csv',
error_bad_lines=False)


Skipping line 4: expected 3 fields, saw 4

Dealing with big datasets


In [20]:
import pandas as pd
iris_chunks = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], chunksize=10)
for chunk in iris_chunks:
    print chunk.shape
    print chunk


(10, 5)
    C1   C2   C3   C4           C5
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
5  5.4  3.9  1.7  0.4  Iris-setosa
6  4.6  3.4  1.4  0.3  Iris-setosa
7  5.0  3.4  1.5  0.2  Iris-setosa
8  4.4  2.9  1.4  0.2  Iris-setosa
9  4.9  3.1  1.5  0.1  Iris-setosa
(10, 5)
    C1   C2   C3   C4           C5
0  5.4  3.7  1.5  0.2  Iris-setosa
1  4.8  3.4  1.6  0.2  Iris-setosa
2  4.8  3.0  1.4  0.1  Iris-setosa
3  4.3  3.0  1.1  0.1  Iris-setosa
4  5.8  4.0  1.2  0.2  Iris-setosa
5  5.7  4.4  1.5  0.4  Iris-setosa
6  5.4  3.9  1.3  0.4  Iris-setosa
7  5.1  3.5  1.4  0.3  Iris-setosa
8  5.7  3.8  1.7  0.3  Iris-setosa
9  5.1  3.8  1.5  0.3  Iris-setosa
(10, 5)
    C1   C2   C3   C4           C5
0  5.4  3.4  1.7  0.2  Iris-setosa
1  5.1  3.7  1.5  0.4  Iris-setosa
2  4.6  3.6  1.0  0.2  Iris-setosa
3  5.1  3.3  1.7  0.5  Iris-setosa
4  4.8  3.4  1.9  0.2  Iris-setosa
5  5.0  3.0  1.6  0.2  Iris-setosa
6  5.0  3.4  1.6  0.4  Iris-setosa
7  5.2  3.5  1.5  0.2  Iris-setosa
8  5.2  3.4  1.4  0.2  Iris-setosa
9  4.7  3.2  1.6  0.2  Iris-setosa
(10, 5)
    C1   C2   C3   C4           C5
0  4.8  3.1  1.6  0.2  Iris-setosa
1  5.4  3.4  1.5  0.4  Iris-setosa
2  5.2  4.1  1.5  0.1  Iris-setosa
3  5.5  4.2  1.4  0.2  Iris-setosa
4  4.9  3.1  1.5  0.1  Iris-setosa
5  5.0  3.2  1.2  0.2  Iris-setosa
6  5.5  3.5  1.3  0.2  Iris-setosa
7  4.9  3.1  1.5  0.1  Iris-setosa
8  4.4  3.0  1.3  0.2  Iris-setosa
9  5.1  3.4  1.5  0.2  Iris-setosa
(10, 5)
    C1   C2   C3   C4           C5
0  5.0  3.5  1.3  0.3  Iris-setosa
1  4.5  2.3  1.3  0.3  Iris-setosa
2  4.4  3.2  1.3  0.2  Iris-setosa
3  5.0  3.5  1.6  0.6  Iris-setosa
4  5.1  3.8  1.9  0.4  Iris-setosa
5  4.8  3.0  1.4  0.3  Iris-setosa
6  5.1  3.8  1.6  0.2  Iris-setosa
7  4.6  3.2  1.4  0.2  Iris-setosa
8  5.3  3.7  1.5  0.2  Iris-setosa
9  5.0  3.3  1.4  0.2  Iris-setosa
(10, 5)
    C1   C2   C3   C4               C5
0  7.0  3.2  4.7  1.4  Iris-versicolor
1  6.4  3.2  4.5  1.5  Iris-versicolor
2  6.9  3.1  4.9  1.5  Iris-versicolor
3  5.5  2.3  4.0  1.3  Iris-versicolor
4  6.5  2.8  4.6  1.5  Iris-versicolor
5  5.7  2.8  4.5  1.3  Iris-versicolor
6  6.3  3.3  4.7  1.6  Iris-versicolor
7  4.9  2.4  3.3  1.0  Iris-versicolor
8  6.6  2.9  4.6  1.3  Iris-versicolor
9  5.2  2.7  3.9  1.4  Iris-versicolor
(10, 5)
    C1   C2   C3   C4               C5
0  5.0  2.0  3.5  1.0  Iris-versicolor
1  5.9  3.0  4.2  1.5  Iris-versicolor
2  6.0  2.2  4.0  1.0  Iris-versicolor
3  6.1  2.9  4.7  1.4  Iris-versicolor
4  5.6  2.9  3.6  1.3  Iris-versicolor
5  6.7  3.1  4.4  1.4  Iris-versicolor
6  5.6  3.0  4.5  1.5  Iris-versicolor
7  5.8  2.7  4.1  1.0  Iris-versicolor
8  6.2  2.2  4.5  1.5  Iris-versicolor
9  5.6  2.5  3.9  1.1  Iris-versicolor
(10, 5)
    C1   C2   C3   C4               C5
0  5.9  3.2  4.8  1.8  Iris-versicolor
1  6.1  2.8  4.0  1.3  Iris-versicolor
2  6.3  2.5  4.9  1.5  Iris-versicolor
3  6.1  2.8  4.7  1.2  Iris-versicolor
4  6.4  2.9  4.3  1.3  Iris-versicolor
5  6.6  3.0  4.4  1.4  Iris-versicolor
6  6.8  2.8  4.8  1.4  Iris-versicolor
7  6.7  3.0  5.0  1.7  Iris-versicolor
8  6.0  2.9  4.5  1.5  Iris-versicolor
9  5.7  2.6  3.5  1.0  Iris-versicolor
(10, 5)
    C1   C2   C3   C4               C5
0  5.5  2.4  3.8  1.1  Iris-versicolor
1  5.5  2.4  3.7  1.0  Iris-versicolor
2  5.8  2.7  3.9  1.2  Iris-versicolor
3  6.0  2.7  5.1  1.6  Iris-versicolor
4  5.4  3.0  4.5  1.5  Iris-versicolor
5  6.0  3.4  4.5  1.6  Iris-versicolor
6  6.7  3.1  4.7  1.5  Iris-versicolor
7  6.3  2.3  4.4  1.3  Iris-versicolor
8  5.6  3.0  4.1  1.3  Iris-versicolor
9  5.5  2.5  4.0  1.3  Iris-versicolor
(10, 5)
    C1   C2   C3   C4               C5
0  5.5  2.6  4.4  1.2  Iris-versicolor
1  6.1  3.0  4.6  1.4  Iris-versicolor
2  5.8  2.6  4.0  1.2  Iris-versicolor
3  5.0  2.3  3.3  1.0  Iris-versicolor
4  5.6  2.7  4.2  1.3  Iris-versicolor
5  5.7  3.0  4.2  1.2  Iris-versicolor
6  5.7  2.9  4.2  1.3  Iris-versicolor
7  6.2  2.9  4.3  1.3  Iris-versicolor
8  5.1  2.5  3.0  1.1  Iris-versicolor
9  5.7  2.8  4.1  1.3  Iris-versicolor
(10, 5)
    C1   C2   C3   C4              C5
0  6.3  3.3  6.0  2.5  Iris-virginica
1  5.8  2.7  5.1  1.9  Iris-virginica
2  7.1  3.0  5.9  2.1  Iris-virginica
3  6.3  2.9  5.6  1.8  Iris-virginica
4  6.5  3.0  5.8  2.2  Iris-virginica
5  7.6  3.0  6.6  2.1  Iris-virginica
6  4.9  2.5  4.5  1.7  Iris-virginica
7  7.3  2.9  6.3  1.8  Iris-virginica
8  6.7  2.5  5.8  1.8  Iris-virginica
9  7.2  3.6  6.1  2.5  Iris-virginica
(10, 5)
    C1   C2   C3   C4              C5
0  6.5  3.2  5.1  2.0  Iris-virginica
1  6.4  2.7  5.3  1.9  Iris-virginica
2  6.8  3.0  5.5  2.1  Iris-virginica
3  5.7  2.5  5.0  2.0  Iris-virginica
4  5.8  2.8  5.1  2.4  Iris-virginica
5  6.4  3.2  5.3  2.3  Iris-virginica
6  6.5  3.0  5.5  1.8  Iris-virginica
7  7.7  3.8  6.7  2.2  Iris-virginica
8  7.7  2.6  6.9  2.3  Iris-virginica
9  6.0  2.2  5.0  1.5  Iris-virginica
(10, 5)
    C1   C2   C3   C4              C5
0  6.9  3.2  5.7  2.3  Iris-virginica
1  5.6  2.8  4.9  2.0  Iris-virginica
2  7.7  2.8  6.7  2.0  Iris-virginica
3  6.3  2.7  4.9  1.8  Iris-virginica
4  6.7  3.3  5.7  2.1  Iris-virginica
5  7.2  3.2  6.0  1.8  Iris-virginica
6  6.2  2.8  4.8  1.8  Iris-virginica
7  6.1  3.0  4.9  1.8  Iris-virginica
8  6.4  2.8  5.6  2.1  Iris-virginica
9  7.2  3.0  5.8  1.6  Iris-virginica
(10, 5)
    C1   C2   C3   C4              C5
0  7.4  2.8  6.1  1.9  Iris-virginica
1  7.9  3.8  6.4  2.0  Iris-virginica
2  6.4  2.8  5.6  2.2  Iris-virginica
3  6.3  2.8  5.1  1.5  Iris-virginica
4  6.1  2.6  5.6  1.4  Iris-virginica
5  7.7  3.0  6.1  2.3  Iris-virginica
6  6.3  3.4  5.6  2.4  Iris-virginica
7  6.4  3.1  5.5  1.8  Iris-virginica
8  6.0  3.0  4.8  1.8  Iris-virginica
9  6.9  3.1  5.4  2.1  Iris-virginica
(10, 5)
    C1   C2   C3   C4              C5
0  6.7  3.1  5.6  2.4  Iris-virginica
1  6.9  3.1  5.1  2.3  Iris-virginica
2  5.8  2.7  5.1  1.9  Iris-virginica
3  6.8  3.2  5.9  2.3  Iris-virginica
4  6.7  3.3  5.7  2.5  Iris-virginica
5  6.7  3.0  5.2  2.3  Iris-virginica
6  6.3  2.5  5.0  1.9  Iris-virginica
7  6.5  3.0  5.2  2.0  Iris-virginica
8  6.2  3.4  5.4  2.3  Iris-virginica
9  5.9  3.0  5.1  1.8  Iris-virginica

In [21]:
iris_iterator = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], iterator=True)

In [22]:
print iris_iterator.get_chunk(10).shape


(10, 5)

In [23]:
print iris_iterator.get_chunk(20).shape


(20, 5)

In [24]:
piece = iris_iterator.get_chunk(2)
piece


Out[24]:
C1 C2 C3 C4 C5
0 4.8 3.1 1.6 0.2 Iris-setosa
1 5.4 3.4 1.5 0.4 Iris-setosa

In [25]:
import csv
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.DictReader(data_stream,
        fieldnames = ['sepal_length', 'sepal_width',
        'petal_length', 'petal_width', 'target'],
        dialect='excel')):
            if n== 0:
                print n,row
            else:
                break


0 {'sepal_width': '3.5', 'petal_width': '0.2', 'target': 'Iris-setosa', 'sepal_length': '5.1', 'petal_length': '1.4'}

In [26]:
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.reader(data_stream,
        dialect='excel')):
            if n==0:
                print row
            else:
                break


['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']

In [27]:
def batch_read(filename, batch=5):
    # open the data stream
    with open(filename, 'rb') as data_stream:
        # reset the batch
        batch_output = list()
        # iterate over the file
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            # if the batch is of the right size
            if n > 0 and n % batch == 0:
                # yield back the batch as an ndarray
                yield(np.array(batch_output))
                # reset the batch and restart
                batch_output = list()
            # otherwise add the row to the batch
            batch_output.append(row)
        # when the loop is over, yield what's left
        yield(np.array(batch_output))

In [28]:
import numpy as np
for batch_input in batch_read(iris_filename, batch=3):
    print batch_input
    break


[['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']]

Accessing other data formats


In [29]:
import pandas as pd
my_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
[1.0]*5, 'Col3': 1.0, 'Col4': 'Hello World!'})
my_own_dataset


Out[29]:
Col1 Col2 Col3 Col4
0 0 1.0 1.0 Hello World!
1 1 1.0 1.0 Hello World!
2 2 1.0 1.0 Hello World!
3 3 1.0 1.0 Hello World!
4 4 1.0 1.0 Hello World!

In [30]:
my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
'string', 'Col3': range(2)})


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-30-472dc6b904c3> in <module>()
      1 my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
----> 2 'string', 'Col3': range(2)})

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    222                                  dtype=dtype, copy=copy)
    223         elif isinstance(data, dict):
--> 224             mgr = self._init_dict(data, index, columns, dtype=dtype)
    225         elif isinstance(data, ma.MaskedArray):
    226             import numpy.ma.mrecords as mrecords

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    358             arrays = [data[k] for k in keys]
    359 
--> 360         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    361 
    362     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5229     # figure out the index, if necessary
   5230     if index is None:
-> 5231         index = extract_index(arrays)
   5232     else:
   5233         index = _ensure_index(index)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in extract_index(data)
   5277             lengths = list(set(raw_lengths))
   5278             if len(lengths) > 1:
-> 5279                 raise ValueError('arrays must all be same length')
   5280 
   5281             if have_dicts:

ValueError: arrays must all be same length

In [31]:
my_own_dataset.dtypes


Out[31]:
Col1      int64
Col2    float64
Col3    float64
Col4     object
dtype: object

In [32]:
my_own_dataset['Col1'] = my_own_dataset['Col1'].astype(float)
my_own_dataset.dtypes


Out[32]:
Col1    float64
Col2    float64
Col3    float64
Col4     object
dtype: object

Data preprocessing


In [33]:
mask_feature = iris['sepal_length'] > 6.0
mask_feature


Out[33]:
0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
120     True
121    False
122     True
123     True
124     True
125     True
126     True
127     True
128     True
129     True
130     True
131     True
132     True
133     True
134     True
135     True
136     True
137     True
138    False
139     True
140     True
141     True
142    False
143     True
144     True
145     True
146     True
147     True
148     True
149    False
Name: sepal_length, dtype: bool

In [34]:
mask_target = iris['target'] == 'Iris-virginica'

In [35]:
iris.loc[mask_target, 'target'] = 'New label'

In [36]:
iris['target'].unique()


Out[36]:
array(['Iris-setosa', 'Iris-versicolor', 'New label'], dtype=object)

In [37]:
grouped_targets_mean = iris.groupby(['target']).mean()
grouped_targets_mean


Out[37]:
sepal_length sepal_width petal_length petal_width
target
Iris-setosa 5.006 3.418 1.464 0.244
Iris-versicolor 5.936 2.770 4.260 1.326
New label 6.588 2.974 5.552 2.026

In [38]:
grouped_targets_var = iris.groupby(['target']).var()
grouped_targets_var


Out[38]:
sepal_length sepal_width petal_length petal_width
target
Iris-setosa 0.124249 0.145180 0.030106 0.011494
Iris-versicolor 0.266433 0.098469 0.220816 0.039106
New label 0.404343 0.104004 0.304588 0.075433

In [39]:
iris.sort_index(by='sepal_length').head()


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  if __name__ == '__main__':
Out[39]:
sepal_length sepal_width petal_length petal_width target
13 4.3 3.0 1.1 0.1 Iris-setosa
42 4.4 3.2 1.3 0.2 Iris-setosa
38 4.4 3.0 1.3 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
41 4.5 2.3 1.3 0.3 Iris-setosa

In [40]:
# This is just an example, with no time_series data
# smooth_time_series = pd.rolling_mean(time_series, 5)

In [41]:
# This is just an example, with no time_series data
# median_time_series = pd.rolling_median(time_series, 5)

Data selection


In [42]:
import pandas as pd
dataset = pd.read_csv('a_selection_example_1.csv')
dataset


Out[42]:
n val1 val2 val3
0 100 10 10 C
1 101 10 20 C
2 102 10 30 B
3 103 10 40 B
4 104 10 50 A

In [43]:
dataset = pd.read_csv('a_selection_example_1.csv', index_col=0)
dataset


Out[43]:
val1 val2 val3
n
100 10 10 C
101 10 20 C
102 10 30 B
103 10 40 B
104 10 50 A

In [44]:
dataset['val3'][104]


Out[44]:
'A'

In [45]:
dataset.loc[104, 'val3']


Out[45]:
'A'

In [46]:
dataset.ix[104, 'val3']


Out[46]:
'A'

In [47]:
dataset.ix[104, 2]


Out[47]:
'A'

In [48]:
dataset.iloc[4, 2]


Out[48]:
'A'

In [49]:
dataset[['val3', 'val2']][0:2]


Out[49]:
val3 val2
n
100 C 10
101 C 20

In [50]:
dataset.loc[range(100, 102), ['val3', 'val2']]


Out[50]:
val3 val2
n
100 C 10
101 C 20

In [51]:
dataset.ix[range(100, 102), ['val3', 'val2']]


Out[51]:
val3 val2
n
100 C 10
101 C 20

In [52]:
dataset.ix[range(100, 102), [2, 1]]


Out[52]:
val3 val2
n
100 C 10
101 C 20

In [53]:
dataset.iloc[range(2), [2,1]]


Out[53]:
val3 val2
n
100 C 10
101 C 20

Working with categorical and textual data


In [54]:
import pandas as pd
categorical_feature = pd.Series(['sunny', 'cloudy', 'snowy',
'rainy', 'foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping


Out[54]:
cloudy foggy rainy snowy sunny
0 0.0 0.0 0.0 0.0 1.0
1 1.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 1.0 0.0
3 0.0 0.0 1.0 0.0 0.0
4 0.0 1.0 0.0 0.0 0.0

In [55]:
mapping['sunny']


Out[55]:
0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: sunny, dtype: float64

In [56]:
mapping['cloudy']


Out[56]:
0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: cloudy, dtype: float64

In [57]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ohe = OneHotEncoder()
levels = ['sunny', 'cloudy', 'snowy', 'rainy', 'foggy']
fit_levs = le.fit_transform(levels)
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]],
[fit_levs[4]]])
print ohe.transform([le.transform(['sunny'])]).toarray()
print ohe.transform([le.transform(['cloudy'])]).toarray()


[[ 0.  0.  0.  0.  1.]]
[[ 1.  0.  0.  0.  0.]]

A special type of data–text


In [60]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-e8ca55a56690> in <module>()
      1 from sklearn.datasets import fetch_20newsgroups
      2 categories = ['sci.med', 'sci.space']
----> 3 twenty_sci_news = fetch_20newsgroups(categories=categories)

/home/moonbury/.local/lib/python2.7/site-packages/sklearn/datasets/twenty_newsgroups.pyc in fetch_20newsgroups(data_home, subset, categories, shuffle, random_state, remove, download_if_missing)
    221         if download_if_missing:
    222             cache = download_20newsgroups(target_dir=twenty_home,
--> 223                                           cache_path=cache_path)
    224         else:
    225             raise IOError('20Newsgroups dataset not found')

/home/moonbury/.local/lib/python2.7/site-packages/sklearn/datasets/twenty_newsgroups.pyc in download_20newsgroups(target_dir, cache_path)
     94 
     95     logger.info("Decompressing %s", archive_path)
---> 96     tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
     97     os.remove(archive_path)
     98 

/usr/lib/python2.7/tarfile.pyc in extractall(self, path, members)
   2049                 tarinfo = copy.copy(tarinfo)
   2050                 tarinfo.mode = 0700
-> 2051             self.extract(tarinfo, path)
   2052 
   2053         # Reverse sort directories.

/usr/lib/python2.7/tarfile.pyc in extract(self, member, path)
   2086 
   2087         try:
-> 2088             self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
   2089         except EnvironmentError, e:
   2090             if self.errorlevel > 0:

/usr/lib/python2.7/tarfile.pyc in _extract_member(self, tarinfo, targetpath)
   2179         if not tarinfo.issym():
   2180             self.chmod(tarinfo, targetpath)
-> 2181             self.utime(tarinfo, targetpath)
   2182 
   2183     #--------------------------------------------------------------------------

/usr/lib/python2.7/tarfile.pyc in utime(self, tarinfo, targetpath)
   2300             return
   2301         try:
-> 2302             os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
   2303         except EnvironmentError, e:
   2304             raise ExtractError("could not change modification time")

KeyboardInterrupt: 

In [59]:
twenty_sci_news.data[0]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-59-34ca0c8e66a8> in <module>()
----> 1 twenty_sci_news.data[0]

NameError: name 'twenty_sci_news' is not defined

In [ ]:
twenty_sci_news.filenames

In [ ]:
print twenty_sci_news.target[0]
print twenty_sci_news.target_names[twenty_sci_news.target[0]]

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
word_count.shape

In [ ]:
print word_count[0]

In [ ]:
word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    print "Word:", word_list[n], "appears", word_count[0, n], "times"

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print "Word:", word_list[n], "has frequency", word_freq[0, n]

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # Default: use_idf=True
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
word_list = tfidf_vect.get_feature_names()
for n in word_tfidf[0].indices:
    print "Word:", word_list[n], "has tfidf", word_tfidf[0, n]

In [ ]:
text_1 = 'we love data science'
text_2 = 'data science is hard'
documents = [text_1, text_2]
documents

In [ ]:
# That is what we say above, the default one
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1),
stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]

In [ ]:
# Now a bi-gram count vectorizer
count_vect_1_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]

In [ ]:
# Now a uni- and bi-gram count vectorizer
count_vect_1_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]

In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.fit_transform(twenty_sci_news.data)
word_hashed.shape

Creating NumPy arrays

From lists to unidimensional arrays


In [ ]:
import numpy as np
# Transform a list into a uni-dimensional array
list_of_ints = [1,2,3]
Array_1 = np.array(list_of_ints)
Array_1

In [ ]:
Array_1[1] # let's output the second value

In [ ]:
type(Array_1)

In [ ]:
Array_1.dtype # Note: The default dtype depends on the system you're operating.

Controlling the memory size


In [ ]:
import numpy as np
Array_1.nbytes # Please note that on 64bit platforms the result will be 24.

In [ ]:
Array_1 = np.array(list_of_ints, dtype= 'int8')

In [ ]:
Array_1b = Array_1.astype('float32')
Array_1b

Heterogeneous lists


In [ ]:
import numpy as np
complex_list = [1,2,3] + [1.,2.,3.] + ['a','b','c']
Array_2 = np.array(complex_list[:3]) # at first the input list is just ints
print 'complex_list[:3]', Array_2.dtype
Array_2 = np.array(complex_list[:6]) # then it is ints and floats
print 'complex_list[:6]', Array_2.dtype
Array_2 = np.array(complex_list) # finally we add strings
print 'complex_list[:] ',Array_2.dtype

In [ ]:
# Check if a NumPy array is of the desired numeric type
print isinstance(Array_2[0],np.number)

From lists to multidimensional arrays


In [ ]:
import numpy as np
# Transform a list into a bidimensional array
a_list_of_lists = [[1,2,3],[4,5,6],[7,8,9]]
Array_2D = np.array(a_list_of_lists )
Array_2D

In [ ]:
Array_2D[1,1]

In [ ]:
# Transform a list into a multi-dimensional array
a_list_of_lists_of_lists = [[[1,2],[3,4],[5,6]],
[[7,8],[9,10],[11,12]]]
Array_3D = np.array(a_list_of_lists_of_lists)
Array_3D

In [ ]:
Array_3D[0,2,0] # Accessing the 5th element

In [ ]:
np.array({1:2,3:4,5:6}.items())

Resizing arrays


In [ ]:
import numpy as np
# Restructuring a NumPy array shape
original_array = np.array([1, 2, 3, 4, 5, 6, 7, 8])
Array_a = original_array.reshape(4,2)
Array_b = original_array.reshape(4,2).copy()
Array_c = original_array.reshape(2,2,2)
# Attention because reshape creates just views, not copies
original_array[0] = -1

In [ ]:
Array_a

In [ ]:
Array_c

In [ ]:
Array_b

In [ ]:
original_array.resize(4,2)
original_array

In [ ]:
original_array.shape = (4,2)

In [ ]:
original_array

Arrays derived from NumPy functions


In [ ]:
import numpy as np
ordinal_values = np.arange(9).reshape(3,3)
ordinal_values

In [ ]:
np.arange(9)[::-1]

In [ ]:
np.random.randint(low=1,high=10,size=(3,3)).reshape(3,3)

In [ ]:
np.zeros((3,3))

In [ ]:
np.ones((3,3))

In [ ]:
np.eye(3)

In [ ]:
fractions = np.linspace(start=0, stop=1, num=10)
fractions

In [ ]:
growth = np.logspace(start=0, stop=1, num=10, base=10.0)
growth

In [ ]:
std_gaussian = np.random.normal(size=(3,3))
std_gaussian

In [ ]:
gaussian = np.random.normal(loc=1.0, scale= 3.0, size=(3,3))
gaussian

In [ ]:
np.random.uniform(low=0.0, high=1.0, size=(3,3))

Getting an array directly from a file


In [ ]:
import numpy as np
housing = np.loadtxt('regression-datasets-housing.csv',delimiter=',', dtype=float)

In [ ]:
np.loadtxt('datasets-uci-iris.csv',delimiter=',',dtype=float)

Extracting data from pandas


In [ ]:
import pandas as pd
import numpy as np
housing_filename = 'regression-datasets-housing.csv'
housing = pd.read_csv(housing_filename, header=None)

In [ ]:
housing_array = housing.values
housing_array.dtype

In [ ]:
housing.dtypes

NumPy fast operation and computations


In [ ]:
import numpy as np
a = np.arange(5).reshape(1,5)
a += 1
a*a

In [ ]:
a = np.arange(5).reshape(1,5) + 1
b = np.arange(5).reshape(5,1) + 1
a * b

In [ ]:
a2 = np.array([1,2,3,4,5] * 5).reshape(5,5)
b2 = a2.T
a2 * b2

In [ ]:
print a2

In [ ]:
np.sum(a2, axis=0)

In [ ]:
np.sum(a2, axis=1)

In [ ]:
%timeit -n 1 -r 3 [i+1.0 for i in range(10**6)]
%timeit -n 1 -r 3 np.arange(10**6)+1.0

In [ ]:
import math
%timeit -n 1 -r 3 [math.sqrt(i) for i in range(10**6)]

In [ ]:
%timeit -n 1 -r 3 np.sqrt(np.arange(10**6))

Matrix operations


In [ ]:
import numpy as np
M = np.arange(5*5, dtype=float).reshape(5,5)
M

In [ ]:
coefs = np.array([1., 0.5, 0.5, 0.5, 0.5])
coefs_matrix = np.column_stack((coefs,coefs[::-1]))
print coefs_matrix

In [ ]:
np.dot(M,coefs)

In [ ]:
np.dot(coefs,M)

In [ ]:
np.dot(M,coefs_matrix)

Slicing and indexing with NumPy arrays


In [ ]:
import numpy as np
M = np.arange(10*10, dtype=int).reshape(10,10)

In [ ]:
M[2:9:2,:]

In [ ]:
M[2:9:2,5:]

In [ ]:
M[2:9:2,5::-1]

In [ ]:
# In the book the output of this cell is wrong.
# Here is reported the correct output.

row_index = (M[:,0]>=20) & (M[:,0]<=80)
col_index = M[0,:]>=5
M[row_index,:][:,col_index]

In [ ]:
mask = (M>=20) & (M<=90) & ((M / 10.) % 1 >= 0.5)
M[mask]

In [ ]:
row_index = [1,1,2,7]
col_index = [0,2,4,8]

In [ ]:
M[row_index,col_index]

In [ ]:
M[row_index,:][:,col_index]

In [ ]:
N = M[2:9:2,5:].copy()

Stacking NumPy arrays


In [ ]:
import numpy as np
dataset = np.arange(10*5).reshape(10,5)

In [ ]:
single_line = np.arange(1*5).reshape(1,5)
a_few_lines = np.arange(3*5).reshape(3,5)

In [ ]:
np.vstack((dataset,single_line))

In [ ]:
np.vstack((dataset,a_few_lines))

In [ ]:
np.vstack((dataset,single_line,single_line))

In [ ]:
bias = np.ones(10).reshape(10,1)
np.hstack((dataset,bias))

In [ ]:
bias = np.ones(10)
np.column_stack((dataset,bias))

In [ ]:
np.dstack((dataset*1,dataset*2,dataset*3))

In [ ]:
np.insert(dataset, 3, bias, axis=1)

In [ ]:
np.insert(dataset, 3, dataset.T, axis=1)

In [ ]:
np.insert(dataset, 3, np.ones(5), axis=0)